Read income data
# ::::: read income data :::::
incex <- read.table('/Users/youmisuk/Desktop/DS3003/W1 - Syllabus & R intro/income_exmpl.dat', header = T, sep = "\t")
summary(incex)
sex age edu occ
Length:1922 Min. :18.00 Length:1922 Length:1922
Class :character 1st Qu.:31.00 Class :character Class :character
Mode :character Median :42.00 Mode :character Mode :character
Mean :41.69
3rd Qu.:52.00
Max. :65.00
oexp income
Min. : 0.0 Min. : 704
1st Qu.: 8.0 1st Qu.:1117
Median :19.0 Median :1304
Mean :19.5 Mean :1313
3rd Qu.:30.0 3rd Qu.:1506
Max. :48.0 Max. :2115
# change order of factor levels
incex$occ <- factor(incex$occ, levels = c('low', 'med.', 'high'))
incex$edu <- factor(incex$edu, levels = c('low', 'med.', 'high'))
incex$sex <- factor(incex$sex, levels = c('m', 'f'), labels = c('male', 'female'))
attach(incex) # attach data frame (facilitates access to variables)
Barplot
- Simple barplots and pie chart
tab <- table(occ, sex) # table we want to plot
tab
sex
occ male female
low 223 372
med. 394 306
high 452 175
par(mfcol = c(2,2)) # set up multiple frames for plotting (2 rows, 2 columns; filled by columns)
op <- par(mar = c(4.5, 4, 3, 0)) # change the margin of the plot (see help(par))
# old parameter values are stored under "op"
barplot(tab[, 1], xlab = 'Occupational status', ylab = 'Frequency',
col = c('grey90', 'grey60', 'grey30'), main = 'Men')
barplot(tab[, 2], xlab = 'Occupational status', ylab = 'Frequency',
col = c('grey90', 'grey60', 'grey30'), main = 'Women')
par(mar = c(2, 1, 2, 0))
pie(tab[, 1], col = c('grey90', 'grey60', 'grey30'), main = 'Men')
pie(tab[, 2], col = c('grey90', 'grey60', 'grey30'), main = 'Women')

par(op) # sets parameter values back to old values (ie margins as before)
- Stacked barplot
par(mfrow = c(1, 2)) # new multiple frame plot (1 row, 2 columns)
barplot(tab, xlab = 'gender', ylab = 'Frequency',
col = c('grey90', 'grey60', 'grey30'), main = 'Occupational status')
text(rep(.7, 3), (cumsum(c(0, tab[-3, 1])) + cumsum(tab[, 1]))/2,
dimnames(tab)$occ, col = c('black', 'white', 'white'))
text(rep(1.9, 3), (cumsum(c(0, tab[-3, 2])) + cumsum(tab[, 2]))/2,
dimnames(tab)$occ, col = c('black', 'white', 'white'))
ptab <- prop.table(tab, 2)
barplot(ptab, xlab = 'gender', ylab = 'Frequency',
col = c('grey90', 'grey60', 'grey30'), main = 'Occupational status')
text(rep(.7, 3), (cumsum(c(0, ptab[-3, 1])) + cumsum(ptab[, 1]))/2,
dimnames(ptab)$occ, col = c('black', 'white', 'white'))
text(rep(1.9, 3), (cumsum(c(0, ptab[-3, 2])) + cumsum(ptab[, 2]))/2,
dimnames(ptab)$occ, col = c('black', 'white', 'white'))

?text
- Frequency tables & frequency distribution
head(f <- table(age)) # simple frequency table of age
age
18 19 20 21 22 23
12 8 15 18 29 31
# putting the assignment into parentheses assigns &
# simultan. prints the result of expression on the screen
head(p <- f / sum(f)) # relative distribution (proportions)
age
18 19 20 21 22 23
0.006243496 0.004162331 0.007804370 0.009365245 0.015088450 0.016129032
head(p <- prop.table(f)) # alternative way for getting proportions: prop.table()
age
18 19 20 21 22 23
0.006243496 0.004162331 0.007804370 0.009365245 0.015088450 0.016129032
head(cf <- cumsum(f)) # cumulative frequencies
18 19 20 21 22 23
12 20 35 53 82 113
head(cp <- cumsum(p)) # cumulative proportions
18 19 20 21 22 23
0.006243496 0.010405827 0.018210198 0.027575442 0.042663892 0.058792924
# create a complete frequency/distribution table using column-bind (cbind())
# cbind(freq = f, prop = p, cum.freq = cf, cum.prop = cp)
head(cbind(freq = f, prop = p, cum.freq = cf, cum.prop = cp), 10)
freq prop cum.freq cum.prop
18 12 0.006243496 12 0.006243496
19 8 0.004162331 20 0.010405827
20 15 0.007804370 35 0.018210198
21 18 0.009365245 53 0.027575442
22 29 0.015088450 82 0.042663892
23 31 0.016129032 113 0.058792924
24 38 0.019771072 151 0.078563996
25 62 0.032258065 213 0.110822060
26 48 0.024973985 261 0.135796046
27 50 0.026014568 311 0.161810614
# ::::: write a simple function producing a complete frequency table :::::
freq.tab <- function(x)
{
# produces a frequency table for a vector x
f <- table(x)
p <- prop.table(f)
cbind(freq = f, prop = p, cum.freq = cumsum(f), cum.prop = cumsum(p))
}
# freq.tab(age)
head(freq.tab(age), 10)
freq prop cum.freq cum.prop
18 12 0.006243496 12 0.006243496
19 8 0.004162331 20 0.010405827
20 15 0.007804370 35 0.018210198
21 18 0.009365245 53 0.027575442
22 29 0.015088450 82 0.042663892
23 31 0.016129032 113 0.058792924
24 38 0.019771072 151 0.078563996
25 62 0.032258065 213 0.110822060
26 48 0.024973985 261 0.135796046
27 50 0.026014568 311 0.161810614
# ::::: frequency distribution of "age" :::::
# by defining "age" as factor plot() produces a strip chart
# the arguments "space" controls the space between bars
plot(as.factor(age), space = 2, ylab = 'Frequency', xlab = 'Age',
main = 'Age Distribution')

Histogram
# ::::: creating grouped data from "age" :::::
head(age5 <- cut(age, seq(15, 65, by = 5)), 20) # cuts the data into 5y age groups
[1] (60,65] (30,35] (55,60] (60,65] (15,20] (35,40] (35,40] (50,55] (45,50]
[10] (50,55] (50,55] (50,55] (20,25] (55,60] (35,40] (55,60] (30,35] (45,50]
[19] (30,35] (55,60]
10 Levels: (15,20] (20,25] (25,30] (30,35] (35,40] (40,45] (45,50] ... (60,65]
head(age10 <- cut(age, seq(15, 65, by = 10)), 20) # cuts the data into 10y age groups
[1] (55,65] (25,35] (55,65] (55,65] (15,25] (35,45] (35,45] (45,55] (45,55]
[10] (45,55] (45,55] (45,55] (15,25] (55,65] (35,45] (55,65] (25,35] (45,55]
[19] (25,35] (55,65]
Levels: (15,25] (25,35] (35,45] (45,55] (55,65]
(tab5 <- table(age5)) # frequency distribution of age5
age5
(15,20] (20,25] (25,30] (30,35] (35,40] (40,45] (45,50] (50,55] (55,60] (60,65]
35 178 235 237 221 227 230 233 224 102
(tab10 <- table(age10)) # frequency distribution of age10
age10
(15,25] (25,35] (35,45] (45,55] (55,65]
213 472 448 463 326
# use our freq.tab() function for investigating the frequency distribution
# of the grouped data
freq.tab(age5)
freq prop cum.freq cum.prop
(15,20] 35 0.01821020 35 0.0182102
(20,25] 178 0.09261186 213 0.1108221
(25,30] 235 0.12226847 448 0.2330905
(30,35] 237 0.12330905 685 0.3563996
(35,40] 221 0.11498439 906 0.4713840
(40,45] 227 0.11810614 1133 0.5894901
(45,50] 230 0.11966701 1363 0.7091571
(50,55] 233 0.12122789 1596 0.8303850
(55,60] 224 0.11654527 1820 0.9469303
(60,65] 102 0.05306972 1922 1.0000000
freq.tab(age10)
freq prop cum.freq cum.prop
(15,25] 213 0.1108221 213 0.1108221
(25,35] 472 0.2455775 685 0.3563996
(35,45] 448 0.2330905 1133 0.5894901
(45,55] 463 0.2408949 1596 0.8303850
(55,65] 326 0.1696150 1922 1.0000000
# ::::: plot these data is using hist() :::::
par(mfrow = c(2,2))
# frequency plots
hist(age, seq(15, 65, by = 5), xlab = 'Age', main = '', col = 'grey90')
hist(age, seq(15, 65, by = 10), xlab = 'Age', main = '', col = 'grey90')
# density plots (prob = T)
hist(age, seq(15, 65, by = 5), xlab = 'Age', main = '', col = 'grey90', prob = T)
hist(age, seq(15, 65, by = 10), xlab = 'Age', main = '', col = 'grey90', prob = T)

Kernel Density Estimation
# ::::: write our own function kden() :::::
kden <- function(x, h, x.val = seq(min(x)-2*h, max(x)+2*h, length = 200)) {
# kernel density with uniform (rectangular) kernel
# calculates density at deliberately chosen values x.val
# x ... numeric data vector
# h ... bandwidth of uniform kernel
# x.val ... numeric vector of values at which the density should be estimated
x.val <- sort(unique(x.val)) # sorts the unique values of x
n <- length(x) # number of observations
# define a kernel function dens.f() inside the kden() function
dens.f <- function(x.v) { # fct. computes the density at x.v
z <- (x.v - x) / h # standardization
d <- ifelse(z > -1 & z < 1, .5, 0) # computes kernel density, either .5 or 0
sum(d) / n / h # compute density
}
dens <- sapply(x.val, dens.f)
data.frame(x = x.val, y = dens) # combine output into a data frame and returns it
}
# compute the kernel density estimate for income with a bandwidth of 100 EUR
den <- kden(incex$income, h = 100)
head(den, 10)
x y
1 504.0000 0
2 513.1005 0
3 522.2010 0
4 531.3015 0
5 540.4020 0
6 549.5025 0
7 558.6030 0
8 567.7035 0
9 576.8040 0
10 585.9045 0
den$x <- den$x - 100 # for a correct step function (use oexp to clearly see it)
plot(den, type = 's', lwd = 3, xlab = 'Income', ylab = 'Density',
main = 'Income Distribution')
rug(incex$income, ticksize = .02) # adds a "rug" plot
# ::::: use built-in function in R: density() :::::
plot(density(incex$income), main = 'Income Distribution')
# combination of histogram and kernel density estimation
hist(incex$income, prob = T, xlab = 'Income', main = 'Income Distribution')
lines(density(incex$income), lwd = 3, col = 'red')

Boxplots
# ::::: computation of quantiles :::::
quantile(age, prob = c(0, .25, .50, .75, 1)) # quartiles
0% 25% 50% 75% 100%
18 31 42 52 65
quantile(age, prob = seq(.0, 1, by = .2)) # quintiles
0% 20% 40% 60% 80% 100%
18 29 37 46 54 65
quantile(age, prob = seq(.0, 1, by = 1/7)) # septiles
0% 14.28571% 28.57143% 42.85714% 57.14286% 71.42857% 85.71429% 100%
18 27 33 39 45 51 57 65
quantile(age, prob = seq(.0, 1, by = .1)) # deciles
0% 10% 20% 30% 40% 50% 60% 70% 80% 90% 100%
18 25 29 33 37 42 46 50 54 58 65
par(mfrow=c(1,2))
boxplot(age, horizontal = T, main="Age")
boxplot(income, horizontal = T, main="Income")

boxplot(income ~ edu + sex, col = rep(c('blue', 'red'), each = 3), ylab = 'Income', names = c('male\nlow', 'male\nmed.', 'male\nhigh', 'female\nlow', 'female\nmed.', 'female\nhigh'), main = 'Income by gender and educational level')
# boxplot with notches and variable width
# notches indicate a 95% confidence interval for the median
# the variable width of the boxes reflect differences in sample size
boxplot(income ~ edu + sex, col = rep(c('blue', 'red'), each = 3), ylab = 'Income', names = c('male\nlow', 'male\nmed.', 'male\nhigh', 'female\nlow', 'female\nmed.', 'female\nhigh'), main = 'Income by gender and educational level', notch = T, varwidth = T)

Quantile-comparison plots
(A) Comparison of densities
# way 1
curve(dnorm, -4, 4, lwd = 3, bty = 'L', ylab = 'Density',
xlab = 'Standard Normal Distr. / Income (Standardized)') # theoretical distribution (standard normal)
lines(density(scale(incex$income)), col = 'red', lwd = 3, lty = 2)
legend('topright', c('Income', 'Normal Distr.'), bty = 'n',
col = c('red', 'black'), lty = c(2, 1), lwd = 3)
# way 2
plot(density(scale(incex$income)), col = 'red', lwd = 3, lty = 2, xlim=c(-4, 4), ylim=c(0, 0.4), main="",
bty = 'L', ylab = 'Density', xlab = 'Standard Normal Distr. / Income (Standardized)')
curve(dnorm, -4, 4, lwd = 3, add=T) # theoretical distribution (standard normal)
legend('topright', c('Income', 'Normal Distr.'), bty = 'n',
col = c('red', 'black'), lty = c(2, 1), lwd = 3)

(B) Quantile-comparison plots
my own function, qq
# ::::: writing our own function qq() :::::
qq <- function(x, ...)
{
# computes empirical and normal quantiles for a numeric vector x
n <- length(x) # number of observations
x <- sort(x) # rank-ordered data
eq <- scale(x) # standardized empirical quantiles
p <- (1:n - .5) / n # cumulative proportions
tq <- qnorm(p) # theoretical (normal) quantiles
plot(tq, x, main = 'Quantile-Comparison Plot', ...)
lines(tq, mean(x) + tq * sd(x), lwd = 2)
invisible(data.frame(x = x, eq = eq, p = p, tq = tq)) # returns a data frame
}
qq.dat <- qq(income)
head(qq.dat) # first 6 quantiles
x eq p tq
1 704 -2.383548 0.0002601457 -3.470086
2 715 -2.340506 0.0007804370 -3.163121
3 720 -2.320941 0.0013007284 -3.011284
4 726 -2.297463 0.0018210198 -2.907609
5 727 -2.293550 0.0023413111 -2.828093
6 732 -2.273986 0.0028616025 -2.763232
tail(qq.dat) # last 6 quantiles
x eq p tq
1917 1983 2.621101 0.9971384 2.763232
1918 2002 2.695447 0.9976587 2.828093
1919 2025 2.785445 0.9981790 2.907609
1920 2033 2.816748 0.9986993 3.011284
1921 2071 2.965440 0.9992196 3.163121
1922 2115 3.137609 0.9997399 3.470086
# QQ-plot with standardized empirical quantiles
# NOTE: plots only differ in the scale of the y-axis
plot(qq.dat$tq, qq.dat$eq)
abline(a = 0, b = 1) # add reference line

qqPlot from package car
par(mfrow=c(1,3))
# ::::: use the qqPlot() function from the car library :::::
library(car)
Loading required package: carData
qqPlot(income, col = 1, cex = .7)
[1] 1493 1190
qqPlot(income, col = 1, cex = .7, distribution = "chisq", df=3)
[1] 1493 1190
qqPlot(rchisq(1000, df=3), col = 1, cex = .7, distribution = "chisq", df=3)

[1] 101 675
qqnorm
qqnorm(income) # standard QQ-plot from the "stats" package

Scatterplots
- Simple scatterplot of “age” and “occupational experience”
- without and with jittering
par(mfrow=c(1,3))
# :::: simple scatterplot of "age" and "occupational experience" :::::
par(mar = c(4.5, 4.5, 1, 1))
plot(age, oexp, xlab = 'Age', ylab = 'Occupational Experience', cex = .4, pch = 16)
plot(jitter(age), jitter(oexp), xlab = 'Age', ylab = 'Occupational Experience',
cex = .4, pch = 16) # with default jittering
plot(jitter(age, factor = 3), jitter(oexp, factor = 3), # more jittering: factor = 3
xlab = 'Age', ylab = 'Occupational Experience', cex = .4, pch = 16)

- changing the degree of transparency
par(mfrow=c(1,3))
for (i in c(0.2, 0.5, 0.8)) {
plot(jitter(incex$age, factor = 3), jitter(incex$oexp, factor = 3),
xlab = 'Age', ylab = 'Occupational Experience', cex = .4, pch = 16,
col=rgb(red = 0, green = 0, blue = 0, alpha = i), main=paste("alpha =", i))
}

library(scales)
par(mfrow=c(1,3))
for (i in c(0.2, 0.5, 0.8)) {
plot(jitter(incex$age, factor = 3), jitter(incex$oexp, factor = 3),
xlab = 'Age', ylab = 'Occupational Experience', cex = .4, pch = 16,
col=alpha("black", i), main=paste("alpha =", i))
}

- Simple scatterplot of “occ” and “occupational experience”
# for categorical variable (x-axis)
plot(oexp, occ,
xlab = 'Occupational Experience', ylab = 'Occupational Status',
cex = .4, pch = 16)
plot(oexp, occ,
xlab = 'Occupational Experience', ylab = 'Occupational Status',
cex = .4, pch = 16, yaxt = 'n')
axis(2, 1:3, levels(occ))
plot(jitter(oexp, factor = 3), jitter(as.numeric(occ), factor = 2),
xlab = 'Occupational Experience', ylab = 'Occupational Status',
cex = .4, pch = 16, yaxt = 'n')
axis(2, 1:3, levels(occ))

par(mfrow=c(1,2))
plot(jitter(as.numeric(edu), factor = 2), jitter(as.numeric(occ), factor = 2),
xlab = 'Educational Level', ylab = 'Occupational Status',
cex = .4, pch = 16, xaxt = 'n', yaxt = 'n')
axis(1, 1:3, levels(edu))
axis(2, 1:3, levels(occ))
boxplot(oexp ~ occ, data = incex, horizontal = T,
xlab = 'Occupational Experience', ylab = 'Occupational Status')

- Scatterplot-matrix
plot(incex[, c('income', 'oexp', 'age', 'edu')], cex = .2)

# from the "car"-package
library(car)
scatterplotMatrix(incex[, c('income', 'oexp', 'age', 'edu')], cex = .3)

- Conditioning plots
# coplot()
coplot(income ~ oexp | edu, data = incex)

coplot(income ~ oexp | edu * occ, data = incex, cex = .6)
# xyplot() from the "lattice"-package
library(lattice)
Attaching package: 'lattice'
The following object is masked _by_ '.GlobalEnv':
qq

xyplot(income ~ oexp, data = incex)

xyplot(income ~ oexp | edu, data = incex)

xyplot(income ~ oexp | edu * occ, data = incex, cex = .4)

- Scatterplots with color and shape separators
# color separator
plot(incex$age, incex$oexp, xlab = 'Age', ylab = 'Occupational
Experience', cex = .4, pch = 16, col=c("red", "green")[as.numeric(sex)])
legend('topleft', c('Male', 'Female'), bty = 'n',
col = c('red', 'green'), pch = c(16, 16))
# shape separator
plot(incex$age, incex$oexp, xlab = 'Age', ylab = 'Occupational
Experience', cex = .4, pch = c(1, 3)[as.numeric(sex)])
legend('topleft', c('Male', 'Female'), bty = 'n', pch = c(1, 3))

- 3-dim scatterplot
cloud(income ~ oexp * sex, data = incex, col = as.numeric(sex)+1)

detach(incex) # detaches the attached data frame "incex"